import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Load the wireless-churn dataset (one row per customer account; target column is 'Churn').
data = pd.read_csv(r'C:\Users\gokul\OneDrive\Desktop\DATA DC\DC SEM 2\2204 Stat and Pred Modelling\Week_15\wireless_churn.csv')
# Preview the first five rows to sanity-check column names and values.
data.head()
| AccountWeeks | ContractRenewal | DataPlan | DataUsage | CustServCalls | DayMins | DayCalls | MonthlyCharge | OverageFee | RoamMins | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 128 | 1 | 1 | 2.7 | 1 | 265.1 | 110 | 89.0 | 9.87 | 10.0 | 0 |
| 1 | 107 | 1 | 1 | 3.7 | 1 | 161.6 | 123 | 82.0 | 9.78 | 13.7 | 0 |
| 2 | 137 | 1 | 0 | 0.0 | 0 | 243.4 | 114 | 52.0 | 6.06 | 12.2 | 0 |
| 3 | 84 | 0 | 0 | 0.0 | 2 | 299.4 | 71 | 57.0 | 3.10 | 6.6 | 0 |
| 4 | 75 | 0 | 0 | 0.0 | 3 | 166.7 | 113 | 41.0 | 7.42 | 10.1 | 0 |
#Show Key Statistics
# count/mean/std/min/quartiles/max for every numeric column — quick scan for
# scale differences and suspicious values before modelling.
data.describe()
| AccountWeeks | ContractRenewal | DataPlan | DataUsage | CustServCalls | DayMins | DayCalls | MonthlyCharge | OverageFee | RoamMins | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3333.000000 | 3333.000000 | 3333.000000 | 3333.000000 | 3333.000000 | 3333.000000 | 3333.000000 | 3333.000000 | 3333.000000 | 3333.000000 | 3333.000000 |
| mean | 101.064806 | 0.903090 | 0.276628 | 0.816475 | 1.562856 | 179.775098 | 100.435644 | 56.305161 | 10.051488 | 10.237294 | 0.144914 |
| std | 39.822106 | 0.295879 | 0.447398 | 1.272668 | 1.315491 | 54.467389 | 20.069084 | 16.426032 | 2.535712 | 2.791840 | 0.352067 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 74.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 143.700000 | 87.000000 | 45.000000 | 8.330000 | 8.500000 | 0.000000 |
| 50% | 101.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 179.400000 | 101.000000 | 53.500000 | 10.070000 | 10.300000 | 0.000000 |
| 75% | 127.000000 | 1.000000 | 1.000000 | 1.780000 | 2.000000 | 216.400000 | 114.000000 | 66.200000 | 11.770000 | 12.100000 | 0.000000 |
| max | 243.000000 | 1.000000 | 1.000000 | 5.400000 | 9.000000 | 350.800000 | 165.000000 | 111.300000 | 18.190000 | 20.000000 | 1.000000 |
# Automated exploratory-data-analysis report (distributions, correlations,
# missing values), rendered inline in the notebook as an iframe.
from ydata_profiling import ProfileReport
profile = ProfileReport(data, title = 'Wireless Churn Report')
profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
#Prepare for Models Comparison
#Create x and y variables
# Features: every column except the target; target: Churn (0 = stayed, 1 = churned).
x = data.drop(columns=['Churn']).to_numpy()
Y = data['Churn'].to_numpy()
#Load Library for Training
from sklearn.model_selection import train_test_split
# Hold out 20% for final evaluation; stratify so both splits keep the churn rate.
x_train, x_test, y_train, y_test = train_test_split(
    x, Y, test_size=0.2, stratify=Y, random_state=100)
# Use built-in isolation forest
from sklearn.ensemble import IsolationForest
# The prediction returns 1 if sample point is inlier. If outlier prediction returns -1
clf_all_features = IsolationForest(random_state=100)
clf_all_features.fit(x_train)
#Predict if a particular sample is an outlier using all features for higher dimensional data set.
y_pred_train = clf_all_features.predict(x_train)
# Vectorized boolean inlier mask — replaces the per-element map/lambda/list
# round-trip, which produced the identical boolean array far more slowly.
y_pred_train2 = y_pred_train == 1
# Exclude suggested outlier samples for improvement of prediction power/score
x_train_mod = x_train[y_pred_train2]
y_train_mod = y_train[y_pred_train2]
#Size of Datasets
print('Original Train Dataset Size : {}'.format(len(x_train)))
print('New Train Dataset Size : {}'.format(len(x_train_mod)))
Original Train Dataset Size : 2666 New Train Dataset Size : 2124
#Scale the Data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the (outlier-filtered) training data only.
x_train2 = sc.fit_transform(x_train_mod)
# BUG FIX: the test set and the full feature matrix must be transformed with
# the training-set statistics, not re-fit (`fit_transform`). Re-fitting on
# the test set leaks evaluation data and puts the features on a different
# scale than the one the model was trained on.
x_test2 = sc.transform(x_test)
x_2 = sc.transform(x)
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
#Construct some pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Each candidate is wrapped in a Pipeline so scaling is re-fit inside every
# cross-validation fold rather than once on the whole training set.
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(solver='lbfgs', class_weight='balanced',
                               max_iter=1000, random_state=100)),
])
pipe_nb = Pipeline([
    ('scl', StandardScaler()),
    ('clf', GaussianNB()),
])
pipeline = [pipe_lr, pipe_nb]

# Grid-search parameter spaces, index-aligned with `pipeline`.
param_gridlogistic = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l2'],
    'clf__solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
}
param_gridnb = {}  # GaussianNB: no hyperparameters searched here
modelpara = [param_gridlogistic, param_gridnb]
#Define Plot for learning curve
from sklearn.model_selection import learning_curve

def plot_learning_curves(model, X=None, y=None):
    """Plot training/validation learning curves for *model*.

    Scores are weighted recall (matching the grid-search scoring), computed
    with 10-fold CV at 10 training-set sizes; mean +/- 1 std is shaded.

    Parameters
    ----------
    model : estimator or pipeline to evaluate.
    X, y : optional data; default to the outlier-filtered training set
           (module-level x_train_mod / y_train_mod) for backward compatibility.
    """
    if X is None:
        X = x_train_mod
    if y is None:
        y = y_train_mod
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model,
        X=X,
        y=y,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
        scoring='recall_weighted', random_state=100)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    # BUG FIX (labels only): the curves are weighted recall, not accuracy —
    # the original labels/axis said "accuracy" and misrepresented the plot.
    plt.plot(train_sizes, train_mean, color='blue', marker='o',
             markersize=5, label='training recall (weighted)')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                     alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
             markersize=5, label='validation recall (weighted)')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                     alpha=0.15, color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Recall (weighted)')
    plt.legend(loc='best')
    plt.ylim([0.5, 1.01])
    plt.show()
#Plot Learning Curve
# Visual check for over/under-fitting of each candidate before grid search.
print('Logistic Regression - Learning Curve')
plot_learning_curves(pipe_lr)
print('\nNaive Bayes - Learning Curve')
plot_learning_curves(pipe_nb)
Logistic Regression - Learning Curve
Naive Bayes - Learning Curve
#Define Gridsearch Function
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

def Gridsearch_cv(model, params):
    """Grid-search *model* over *params*, then evaluate on the hold-out set.

    Fits with repeated 10-fold CV (5 repeats, weighted-recall scoring) on the
    module-level x_train_mod / y_train_mod, predicts on x_test, and prints the
    best parameters, confusion matrix, classification report and ROC curve.

    Parameters
    ----------
    model : a Pipeline whose final step is named 'clf'.
    params : grid-search parameter dict (keys prefixed 'clf__').
    """
    #Cross-validation Function
    cv2 = RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
    #GridSearch CV
    gs_clf = GridSearchCV(model, params, cv=cv2, scoring='recall_weighted')
    gs_clf = gs_clf.fit(x_train_mod, y_train_mod)
    best_model = gs_clf.best_estimator_
    # BUG FIX: predict on the raw x_test. The pipeline already contains a
    # StandardScaler fitted on x_train_mod; feeding x_test2 (scaled by a
    # scaler re-fitted elsewhere) would scale the data twice with mismatched
    # statistics and corrupt the evaluation.
    y_pred = best_model.predict(x_test)
    #Identify Best Parameters to Optimize the Model
    bestpara = str(gs_clf.best_params_)
    #Output Heading
    print('\n__________________________________________________________________________________________________')
    print('\nOptimized Model')
    # BUG FIX: read the classifier from the fitted best estimator instead of
    # the global name `pipeline`, which only resolved to a Pipeline because
    # the driver loop happened to shadow the list with the current element.
    print('\nModel Name:', str(best_model.named_steps['clf']))
    #Output Validation Statistics
    print('\nBest Parameters:', bestpara)
    print('\n', confusion_matrix(y_test, y_pred))
    print('\n', classification_report(y_test, y_pred))
    #Transform the variables into binary (0,1) - ROC Curve
    from sklearn import preprocessing
    Forecast1 = pd.DataFrame(y_pred)
    Outcome1 = pd.DataFrame(y_test)
    lb1 = preprocessing.LabelBinarizer()
    OutcomeB1 = lb1.fit_transform(Outcome1)
    # transform (not fit_transform): reuse the encoding fitted on the truth
    # labels so predicted and true classes map to the same binary columns.
    ForecastB1 = lb1.transform(Forecast1)
    #Setup the ROC Curve
    from sklearn.metrics import roc_curve, auc
    from sklearn import metrics
    fpr, tpr, threshold = metrics.roc_curve(OutcomeB1, ForecastB1)
    roc_auc = metrics.auc(fpr, tpr)
    print('ROC Curve')
    #Plot the ROC Curve
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
#Run Models
# NOTE(review): the loop variables rebind the global names `pipeline` and
# `modelpara` to the current element each iteration (zip() captures the lists
# first, so iteration itself is unaffected). Gridsearch_cv reads the global
# `pipeline` and only works because of this shadowing — rename the loop
# variables only together with a fix inside Gridsearch_cv.
for pipeline, modelpara in zip(pipeline,modelpara):
    Gridsearch_cv(pipeline,modelpara)
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( 
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn( c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
__________________________________________________________________________________________________
Optimized Model
Model Name: LogisticRegression(class_weight='balanced', max_iter=1000, random_state=100)
Best Parameters: {'clf__C': 0.01, 'clf__penalty': 'l2', 'clf__solver': 'sag'}
[[535 35]
[ 75 22]]
precision recall f1-score support
0 0.88 0.94 0.91 570
1 0.39 0.23 0.29 97
accuracy 0.84 667
macro avg 0.63 0.58 0.60 667
weighted avg 0.81 0.84 0.82 667
ROC Curve
__________________________________________________________________________________________________
Optimized Model
Model Name: GaussianNB()
Best Parameters: {}
[[185 385]
[ 9 88]]
precision recall f1-score support
0 0.95 0.32 0.48 570
1 0.19 0.91 0.31 97
accuracy 0.41 667
macro avg 0.57 0.62 0.40 667
weighted avg 0.84 0.41 0.46 667
ROC Curve
#Create Voting Model - Sklearn
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingClassifier

# Base learners for the soft-voting ensemble.
mod1 = LogisticRegression(solver='lbfgs', class_weight='balanced',
                          max_iter=1000, random_state=100)
mod2 = GradientBoostingClassifier(random_state=100)
mod3 = GaussianNB()
# BUG FIX: the original appended the undefined names model1/model2/model3
# (NameError at runtime); the estimators created above are mod1/mod2/mod3.
estimators = [
    ('LogisticRegression', mod1),
    ('GradientBoostingClassifier', mod2),
    ('GaussianNB', mod3),
]
voting_clf = VotingClassifier(estimators, voting='soft')

scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro'}

print('\nVoting Model')
for clf in (mod1, mod2, mod3, voting_clf):
    # Fit on the scaled, outlier-filtered training data, then score with
    # repeated 10-fold CV over the full (scaled) dataset and report mean
    # macro recall.
    fitted = clf.fit(x_train2, y_train_mod)
    ens_rkf1 = RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
    rKFcv = cross_validate(fitted, x_2, Y, scoring=scoring, cv=ens_rkf1)
    print(clf.__class__.__name__, round(rKFcv['test_rec_macro'].mean(), 2))
Voting Model LogisticRegression 0.76 GradientBoostingClassifier 0.82 GaussianNB 0.67 VotingClassifier 0.81